library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding

Web Scraping

Create a function that, given the Amazon product ID, retrieves some valuable information (like product details and number of customer ratings)

# Scrape basic product information for a given Amazon UK product id.
#
# @param id Amazon product id (ASIN), e.g. "0099908506".
# @return A one-row tibble with character columns `product_details`
#   (the detail-bullets block, without rank or review counts) and
#   `number_of_ratings`; either is NA when the element is missing.
amazon_product_info <- function(id) {
  url <- paste0("https://www.amazon.co.uk/dp/", id)
  html <- read_html(url)

  # Product details block (does not include rank or number of reviews)
  product_details <- html %>%
    html_element("#detailBullets_feature_div") %>%
    html_element("[class='a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list']") %>%
    html_text2()

  # Only the number of customer ratings
  number_of_ratings <- html %>%
    html_element("#acrCustomerReviewText") %>%
    html_text2()

  # The last expression is the function's value; piping into return() is
  # an anti-pattern and has been removed
  tibble(product_details, number_of_ratings)
}

After choosing a product, we use the previous function to obtain information about it.

# Pick a product and retrieve its information with the function above
id_prod <- "0099908506" # product id (ASIN)
prod_info <- amazon_product_info(id_prod)
prod_info
## # A tibble: 1 × 2
##   product_details                                               number_of_ratin…
##   <chr>                                                         <chr>           
## 1 "ASIN ‏ : ‎ 0099908506\nPublisher ‏ : ‎ Arrow; New Ed edition (1… 5,949 ratings

Scrape Product Reviews

Create a function to obtain the product reviews (title, text, review stars), considering both UK reviews and not from UK ones.

# Scrape one page of product reviews (title, text, star rating),
# combining UK reviews and reviews from other marketplaces.
#
# @param id Amazon product id (ASIN).
# @param page Review page number (1-based).
# @return A tibble with columns `title`, `text`, `star`, `page`.
amazon_reviews <- function(id, page) {
  url <- paste0("https://www.amazon.co.uk/product-reviews/", id,
                "/?pageNumber=", page)
  html <- read_html(url)

  # Helper: text of all nodes matching a CSS selector
  grab <- function(selector) {
    html %>%
      html_elements(selector) %>%
      html_text2()
  }

  # Review titles: UK reviews use a link element, non-UK ones do not,
  # so two selectors are combined (UK first, matching the original order)
  title <- c(
    grab("[class='a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold']"),
    grab("[class='a-size-base review-title a-color-base review-title-content a-text-bold']")
  )

  # Review body text (same markup for UK and non-UK reviews)
  text <- grab("[class='a-size-base review-text review-text-content']")

  # Star ratings: UK and non-UK reviews use different data-hook attributes
  star <- c(
    grab("[data-hook='review-star-rating']"),
    grab("[data-hook='cmps-review-star-rating']")
  )

  tibble(title, text, star, page = page)
}

With map_df function from the purrr package we can iterate the task over multiple pages to create a dataframe.

# purrr is already attached by library(tidyverse); the explicit call is
# kept to make the dependency of map_df() obvious
library(purrr)
page <- 1:30
# Scrape all pages and row-bind the results into one data frame
prod_rev <- map_df(page, ~amazon_reviews(id_prod, page = .))

# Add a document id used later to join sentiment scores back;
# seq_len() is safe even when nrow() is 0 (1:nrow would give c(1, 0))
prod_rev$doc_id <- seq_len(nrow(prod_rev))
head(prod_rev)
## # A tibble: 6 × 5
##   title                                                 text  star   page doc_id
##   <chr>                                                 <chr> <chr> <int>  <int>
## 1 Flawed, dated, but still brilliant                    "I f… 5.0 …     1      1
## 2 Alcoholic's Twaddle                                   "My … 2.0 …     1      2
## 3 I suppose ol’ Ernie got better after this…            "Thi… 2.0 …     1      3
## 4 An Old Classic That Stands the Test of Time           "It … 5.0 …     1      4
## 5 Moonshine                                             "An … 3.0 …     1      5
## 6 Heavy on style, light on plot, light on fidelity to … "Par… 3.0 …     1      6

Data Cleaning and Pre-Processing

Language Detection

Consider only English written reviews

library(cld2) # detect_language() returns NA when the language cannot be determined

# Detect the language of both the title and the body of each review
prod_rev <- prod_rev %>%
  mutate(
    title_lang = detect_language(title),
    text_lang = detect_language(text)
  )
# Cross-tabulate text-language vs title-language detections, keeping NAs
table(prod_rev$text_lang, prod_rev$title_lang, useNA = "always")
##       
##         de  en  es  fr  mg  pt <NA>
##   ca     0   0   0   0   0   1    0
##   de     1   1   0   0   0   0    1
##   en     0 168   0   0   1   0   82
##   es     0   1   2   0   0   0    3
##   fr     0   1   0   5   0   0    3
##   it     0   1   0   0   0   0    2
##   pt     0   0   1   0   0   2    0
##   <NA>   0   0   0   0   0   0   24
# Keep only the reviews whose text is detected as English
prod_rev <- filter(prod_rev, text_lang == "en")
prod_rev
## # A tibble: 251 × 7
##    title                           text  star   page doc_id title_lang text_lang
##    <chr>                           <chr> <chr> <int>  <int> <chr>      <chr>    
##  1 Flawed, dated, but still brill… "I f… 5.0 …     1      1 en         en       
##  2 Alcoholic's Twaddle             "My … 2.0 …     1      2 en         en       
##  3 I suppose ol’ Ernie got better… "Thi… 2.0 …     1      3 en         en       
##  4 An Old Classic That Stands the… "It … 5.0 …     1      4 en         en       
##  5 Moonshine                       "An … 3.0 …     1      5 <NA>       en       
##  6 Heavy on style, light on plot,… "Par… 3.0 …     1      6 en         en       
##  7 A VINTAGE NOVEL                 "Thi… 4.0 …     1      7 <NA>       en       
##  8 Nothing happens                 "A s… 2.0 …     1      8 en         en       
##  9 Tedious, lacking direction and… "I'v… 1.0 …     1      9 en         en       
## 10 More style than subsance        "A h… 4.0 …     1     10 en         en       
## # … with 241 more rows

SCORE

Extract a numeric score from the stars string

# Extract the leading digit of the star string
# (e.g. "5.0 out of 5 stars" -> 5) and store it as a numeric score
prod_rev <- prod_rev %>%
  mutate(score = as.numeric(str_sub(star, 1, 1)))
summary(prod_rev$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     3.0     4.0     3.9     5.0     5.0
# Absolute and relative frequency of each star score
prod_rev %>%
  count(score) %>%
  mutate(p = round(prop.table(n), 2))
## # A tibble: 5 × 3
##   score     n     p
##   <dbl> <int> <dbl>
## 1     1    18  0.07
## 2     2    25  0.1 
## 3     3    39  0.16
## 4     4    51  0.2 
## 5     5   118  0.47
prod_rev %>%
  ggplot(aes(x = score)) +
  # geom_bar() counts occurrences by default (stat = "count"), so the
  # deprecated `..count..` y-aesthetic the original mapped was redundant
  geom_bar(fill = "steelblue") +
  labs(title = "Amazon reviews' stars", x = "Stars", y = "Number of comments") +
  theme_bw() +
  theme(plot.title = element_text(color = "steelblue", size = 12, face = "bold"),
        plot.subtitle = element_text(color = "steelblue2"))

It appears that positive reviews prevail. From the 5-class score, we can transform it into a binary classification: if the review has 4 or 5 stars it is positive, otherwise it is negative.

# Binary variable creation: use the numeric `score` column computed above.
# The original compared the *character* column `star` (e.g. "5.0 out of 5
# stars") against 4, which only worked by lexicographic accident.
prod_rev <- prod_rev %>%
  mutate(star_sent = ifelse(score >= 4, "positive", "negative"))
# Binary variable's distribution
prod_rev %>%
  count(star_sent) %>%
  mutate(p = round(n / sum(n), 2))
## # A tibble: 2 × 3
##   star_sent     n     p
##   <chr>     <int> <dbl>
## 1 negative     82  0.33
## 2 positive    169  0.67

We can also compare some features differences between positive and negative reviews (like length of text).

# Compare review length (number of characters) across sentiment classes
prod_rev <- prod_rev %>% mutate(nchar = str_length(text))
ggplot(prod_rev, aes(x = star_sent, y = nchar, fill = star_sent)) +
  geom_boxplot() +
  theme_bw() +
  scale_fill_manual(values = c("steelblue", "skyblue"))

Text Cleaning

In order to conduct a better analysis we need to clean the text data, making it easier to work with. Stop-words (customized in our case), upper-case letters, punctuation and digits are dropped.

We create our own custom stop-words because the standard list has a problem with the apostrophe symbol: contractions written with the typographic ’ are not detected. Since filtering with the default option alone would also delete some non-stop-words, we build custom_stopwords instead.

library(tidytext) 
# Create our custom stop-words: the default list misses contractions written
# with the typographic apostrophe (e.g. "don’t"), so we list them explicitly
# and bind them to tidytext's stop_words
custom_stopwords = bind_rows(
  tibble(word = c(
    "t’s","i’m","you’re","he’s","she’s","it’s","we’re","they’re","i’ve","you’ve","we’ve","they’ve","i’d","you’d","he’d","she’d",
    "we’d","they’d","i’ll","you’ll","he’ll", "she’ll","we’ll","they’ll","isn’t","aren’t","wasn’t","weren’t","hasn’t",
    "haven’t","hadn’t","doesn’t","don’t","didn’t","won’t","wouldn’t","shan’t","shouldn’t","can’t","cannot","couldn’t","mustn’t",
    "let’s","that’s","who’s", "what’s","here’s","there’s","when’s","where’s","why’s","how’s","a’s","ain’t", "c’s","c’mon"),
    lexicon = "custom"), stop_words) 
# Tokenize to one word per row, then drop stop-words and unwanted tokens
tidy_text = prod_rev %>% 
  unnest_tokens(word, text) %>% 
  anti_join(custom_stopwords) %>% 
  filter(!str_detect(word, "^([[:digit:]]+)$")) %>%  # drop pure numbers (~130 words)
  filter(!str_detect(word, "^([[:alnum:]]+)[.,]([[:alnum:]]+)")) # drop decimals and "word.word" punctuation mistakes
## Joining, by = "word"
                                                                 # + word.word(mistakes in punctuation)(~300 words)
# Inspect the most frequent terms after cleaning
freq.df <- count(tidy_text, word, sort = TRUE)
head(freq.df, 20)
## # A tibble: 20 × 2
##    word            n
##    <chr>       <int>
##  1 book          193
##  2 hemingway     174
##  3 read          125
##  4 brett          94
##  5 jake           85
##  6 characters     72
##  7 paris          71
##  8 story          69
##  9 time           67
## 10 hemingway's    56
## 11 reading        56
## 12 spain          55
## 13 style          55
## 14 love           53
## 15 war            50
## 16 pamplona       46
## 17 life           45
## 18 bull           42
## 19 fiesta         39
## 20 writing        39

Word Normalization

For word normalization we could use either stemming or lemmatization. The goal of both methods is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For our analysis we use Stemming, which is the process of reducing the word to its root eliminating the suffix.

# STEMMING: reduce each token to its root with the Snowball stemmer
library(SnowballC)
tidy_stem <- mutate(tidy_text, word = wordStem(word))

# LEMMATIZATION
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tidy_lemma <- udpipe(prod_rev, "english-gum") # annotate reviews: tokens, lemmas, POS tags, dependencies
tidy_lemma = tidy_lemma %>%
  mutate(stem = wordStem(token)) %>% # add a stem column so stems and lemmas can be compared
  tibble()

tidy_lemma # table and the differences between token (word) lemmas and stems:
## # A tibble: 26,236 × 18
##    doc_id paragraph_id sentence_id sentence   start   end term_id token_id token
##    <chr>         <int>       <int> <chr>      <int> <int>   <int> <chr>    <chr>
##  1 1                 1           1 I finishe…     1     1       1 1        I    
##  2 1                 1           1 I finishe…     3    10       2 2        fini…
##  3 1                 1           1 I finishe…    12    15       3 3        this 
##  4 1                 1           1 I finishe…    17    21       4 4        novel
##  5 1                 1           1 I finishe…    23    25       5 5        for  
##  6 1                 1           1 I finishe…    27    29       6 6        the  
##  7 1                 1           1 I finishe…    31    36       7 7        seco…
##  8 1                 1           1 I finishe…    38    41       8 8        time 
##  9 1                 1           1 I finishe…    43    46       9 9        last 
## 10 1                 1           1 I finishe…    48    52      10 10       night
## # … with 26,226 more rows, and 9 more variables: lemma <chr>, upos <chr>,
## #   xpos <chr>, feats <chr>, head_token_id <chr>, dep_rel <chr>, deps <chr>,
## #   misc <chr>, stem <chr>
# Side-by-side comparison of surface token, lemma and stem
tidy_lemma %>%
  select(token, lemma, stem)
## # A tibble: 26,236 × 3
##    token    lemma  stem  
##    <chr>    <chr>  <chr> 
##  1 I        I      I     
##  2 finished finish finish
##  3 this     this   thi   
##  4 novel    novel  novel 
##  5 for      for    for   
##  6 the      the    the   
##  7 second   second second
##  8 time     time   time  
##  9 last     last   last  
## 10 night    night  night 
## # … with 26,226 more rows

Dictionary-based Sentiment Analysis

Tidy Approach

We first consider the tidy approach, where we consider words as tokens. With this SA approach, we will use three lexicons: BING (gives words a positive or negative sentiment), AFINN (rates words with a value from -5 to +5), and NRC (labels words with eight emotions as well as positive/negative sentiment). The procedure for each of these lexicons is similar, but the results depend on the lexicon itself. With every specific lexicon, we are able to give a sentiment or value to (almost) each word, and then we compute the value of each review as an aggregation of the contained words’ values/sentiments. We later plot our results using histograms.

Bing Lexicon
# Per-review BING score: number of positive minus number of negative words
bing <- get_sentiments("bing")
prod_rev_bing <- tidy_text %>%
  select(doc_id, word) %>%
  inner_join(bing) %>%
  count(doc_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(bing = positive - negative)
## Joining, by = "word"
# Attach the per-review BING score; reviews containing no lexicon words get NA
prod_rev <- prod_rev %>%
  left_join(select(prod_rev_bing, doc_id, bing))
## Joining, by = "doc_id"
hist(prod_rev$bing, col = "red", main = "Sentiment distribution - tidy- bing lexicon")

summary(prod_rev$bing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -25.0000  -1.0000   0.0000  -0.3005   1.5000  12.0000       48
# Analyze each word's contribution to the sentiment:
# frequency of every word by its BING sentiment class
bing_word_counts <- tidy_text %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Plot the 5 most frequent words for each sentiment class
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%  # order bars by frequency
  ggplot(aes(n, word, fill = sentiment)) + 
  geom_col(show.legend = F) + 
  facet_wrap(~sentiment, scales = "free_y") + 
  labs(x = "Contribution to sentiment", y = NULL) +
  theme_bw() + scale_fill_manual(values = c("steelblue","skyblue"))

We can also plot a word-cloud. The color represent the sentiment associated to a particular word, while the size of each word depends on the its frequency.

library(wordcloud)
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.5
library(wordcloud2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_text %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("orangered", "darkgreen"), max.words = 100)
## Joining, by = "word"

AFINN LEXICON

We conduct the same analysis as before, using the AFINN lexicon (a few arrangements are needed).

afinn = get_sentiments("afinn")
# Get sentiment score 
prod_rev_afinn = tidy_text %>%
  select(doc_id, word) %>%
  inner_join(afinn) %>%
  group_by(doc_id) %>% 
  summarise(afinn = sum(value))
## Joining, by = "word"
prod_rev = prod_rev %>%
  left_join(prod_rev_afinn %>%
              select(doc_id, afinn))
## Joining, by = "doc_id"
hist(prod_rev$afinn, col = "blue", main = "Sentiment distribution - tidy - afinn lexicon") 

summary(prod_rev$afinn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -26.000  -2.000   3.000   1.753   5.000  30.000      69
# Let's see the contribution of words to the sentiment.
afinn_word_counts <- tidy_text %>%
  inner_join(get_sentiments("afinn")) %>%
  count(word, value, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
afinn_word_counts %>%
  group_by(value) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = value)) + geom_col(show.legend = F) + 
  facet_wrap(~value, scales = "free_y") + labs(x = "Contribution to sentiment", 
                                                   y = NULL) 

NRC LEXICON

We conduct the same analysis as before, using the NRC lexicon (a few arrangements are needed).

nrc = get_sentiments("nrc")
# Get sentiment score 
prod_rev_nrc = tidy_text %>%
  select(doc_id, word) %>%
  inner_join(nrc) %>%
  count(doc_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(nrc = positive - negative)
## Joining, by = "word"
prod_rev = prod_rev %>%
  left_join(prod_rev_nrc %>%
              select(doc_id, nrc))
## Joining, by = "doc_id"
hist(prod_rev$nrc, col = "yellow", main = "Sentiment distribution - tidy - nrc lexicon") 

summary(prod_rev$nrc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -10.000   0.000   1.000   2.363   4.000  28.000      36
# Let's see the contribution of words to the sentiment.
nrc_word_counts <- tidy_text %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
nrc_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment", 
                                                   y = NULL) + theme_bw() 

Comparison Between Different Lexicons

Lexicons histogram - comparing sentiment distribution using different lexicons

prod_rev %>% 
  ggplot() + 
  geom_histogram(aes(x = bing, fill = "b"), bins = 40, alpha = 0.5) +
  geom_histogram(aes(x = afinn, fill = "a"), bins = 40, alpha = 0.5) +
  geom_histogram(aes(x = nrc, fill = "n"), bins = 40 , alpha = 0.5) +
  scale_fill_manual(name ="lexicon", values = c("b" = "red", "a" = "blue", "n" = "yellow"),
                    labels=c("b" = "bing", "a" = "afinn", "n" = "nrc")) +
  labs(title= "Sentiment Distribution using all 3 lexicons", y = "Frequency", x = "Sentiment")
## Warning: Removed 48 rows containing non-finite values (stat_bin).
## Warning: Removed 69 rows containing non-finite values (stat_bin).
## Warning: Removed 36 rows containing non-finite values (stat_bin).

Word count sentiments - compare most common positive/negative words (considering different lexicons)

#BING 
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + 
  geom_col(show.legend = F) + 
  facet_wrap(~sentiment, scales = "free_y") + 
  labs(x = "Contribution to sentiment - BING", y = NULL) +
  theme_bw() + scale_fill_manual(values = c("red4","red"))

# AFINN
afinn_word_counts %>%
  group_by(sentiment = ifelse(value>0, "positive", "negative")) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + 
  geom_col(show.legend = F) + 
  facet_wrap(~sentiment, scales = "free_y") + 
  labs(x = "Contribution to sentiment - AFINN", y = NULL) +
  theme_bw() + scale_fill_manual(values = c("steelblue","skyblue"))

# NRC
nrc_word_counts %>%
  filter(sentiment %in% c("positive", "negative")) %>% 
  group_by(sentiment) %>%
  slice_max(n, n = 5, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment - NRC", y = NULL) + theme_bw() +
  scale_fill_manual(values = c("goldenrod3","gold")) 

Udpipe Approach

With this approach, we also consider polarity negators and polarity amplifiers (we will consider the previous 2 words, not the following ones). The performance increases when we consider them both. However, this approach is also not free from possible problems: there are some situations in which it under-performs the previous one. We can use lemmas or words in the analysis and any one of the three lexicons (stop-words will not be dropped).

Bing Lexicon with Lemmas
library(udpipe)
data_udpipe <- udpipe(prod_rev, "english-gum") # annotate reviews (tokens, lemmas, POS, ...)

# Recode BING to the numeric polarities txt_sentiment() expects:
# +1 for positive words, -1 for negative words
bing_dict = get_sentiments("bing") %>%
  mutate(sentiment = ifelse(sentiment == "negative", -1, 1)) %>%
  rename(term = "word", polarity = "sentiment")

scores_b <- txt_sentiment(x = data_udpipe, 
                        term = "lemma", # score lemmas rather than raw words
                        polarity_terms = bing_dict, # note: stop-words are NOT dropped here
                        polarity_negators = "not", # negator flipping the polarity of a nearby term
                        polarity_amplifiers = "very", # amplifier strengthening a nearby term
                        n_before = 2, # consider the 2 terms preceding each polarity term
                        n_after = 0, # and none of the following ones
                        constrain = F)
prod_rev$udpipe_bing_l = scores_b$overall$sentiment_polarity

summary(prod_rev$udpipe_bing_l)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -15.0000   0.0000   1.0000   0.9665   2.0000  14.0000
summary(prod_rev$bing)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -25.0000  -1.0000   0.0000  -0.3005   1.5000  12.0000       48
# Compare distributions between the tidy approach using Bing and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$bing), col = "lightblue", main = "Sentiment distribution-bing")
hist(scale(prod_rev$udpipe_bing_l), col = "lightblue", main = "udpipe (bing dict-lemmas)")

Bing Lexicon with Words
scores_c <- txt_sentiment(x = data_udpipe, 
                        term = "token", # score raw tokens this time (the previous run used lemmas)
                        polarity_terms = bing_dict, # note: stop-words are NOT dropped here
                        polarity_negators = c("not"), # negator flipping the polarity of a nearby term
                        polarity_amplifiers = c("very"), # amplifier strengthening a nearby term
                        n_before = 2, # consider the 2 terms preceding each polarity term
                        n_after = 0, # and none of the following ones
                        constrain = F)
prod_rev$udpipe_bing_w = scores_c$overall$sentiment_polarity
summary(prod_rev$udpipe_bing_w)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11.2000   0.0000   1.0000   0.9474   2.0000  13.0000
# Compare distributions between the tidy approach using Bing, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$bing), col = "lightblue", main = "Sentiment distribution - bing") # tidy approach 
hist(scale(prod_rev$udpipe_bing_w), col = "lightblue", main = "udpipe (bing dict) - words") # udpipe approach with words 
hist(scale(prod_rev$udpipe_bing_l), col = "lightblue", main = "udpipe (bing dict-lemmas)") # udpipe with lemmas 

We can repeat all this process for all the other lexicons.

Afinn Lexicon with Lemmas
afinn_dict = get_sentiments("afinn") %>%
  rename(term = "word", polarity = "value")

data_udpipe <- udpipe(prod_rev, "english-gum")

scores_a <- txt_sentiment(x = data_udpipe, 
                        term = "lemma", #in this case we use lemmas instead of words 
                        polarity_terms = afinn_dict, #we also not dropping stop-words
                        polarity_negators = c("not"), #there'll be some difference 
                        polarity_amplifiers = c("very"), 
                        n_before = 2,
                        n_after = 0,
                        constrain = F)
prod_rev$udpipe_afinn_l = scores_a$overall$sentiment_polarity
hist(prod_rev$udpipe_afinn_l, col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")

summary(prod_rev$udpipe_afinn_l)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -13.000   0.000   3.000   4.683   6.800  52.600
summary(prod_rev$afinn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -26.000  -2.000   3.000   1.753   5.000  30.000      69
# Compare distributions between the tidy approach using Afinn and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$afinn), col = "lightblue", main = "Sentiment distribution - afinn")
hist(scale(prod_rev$udpipe_afinn_l), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")

Afinn Lexicon with Words
scores_a <- txt_sentiment(x = data_udpipe, 
                        term = "token", # score raw tokens this time (the previous run used lemmas)
                        polarity_terms = afinn_dict, # AFINN values in [-5, 5]; stop-words are NOT dropped
                        polarity_negators = c("not"), # negator flipping the polarity of a nearby term
                        polarity_amplifiers = c("very"), # amplifier strengthening a nearby term
                        n_before = 2, # consider the 2 terms preceding each polarity term
                        n_after = 0, # and none of the following ones
                        constrain = F)
prod_rev$udpipe_afinn_w = scores_a$overall$sentiment_polarity
hist(prod_rev$udpipe_afinn_w, col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - words)")

summary(prod_rev$udpipe_afinn_w)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -13.00    0.00    3.00    4.24    6.00   57.00
# Compare distributions between the tidy approach using Afinn, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$afinn), col = "lightblue", main = "Sentiment distribution - afinn")
hist(scale(prod_rev$udpipe_afinn_l), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")
hist(scale(prod_rev$udpipe_afinn_w), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - words)")

NRC Lexicon with Lemmas
nrc_dict = get_sentiments("nrc") %>%
  mutate(sentiment = ifelse(sentiment == "negative", -1, 1)) %>%
  rename(term = "word", polarity = "sentiment")

data_udpipe <- udpipe(prod_rev, "english-gum")

scores_n <- txt_sentiment(x = data_udpipe, 
                        term = "lemma", #in this case we use lemmas instead of words 
                        polarity_terms = nrc_dict, #we also not dropping stop-words
                        polarity_negators = c("not"), #there'll be some difference 
                        polarity_amplifiers = c("very"), 
                        n_before = 2,
                        n_after = 0,
                        constrain = F)
prod_rev$udpipe_nrc_l = scores_n$overall$sentiment_polarity
summary(prod_rev$udpipe_nrc_l)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   1.000   3.000   7.363   6.500  79.000
summary(prod_rev$nrc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -10.000   0.000   1.000   2.363   4.000  28.000      36
# Compare distributions between the tidy approach using NRC and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$nrc), col = "lightblue", main = "Sentiment distribution - nrc")
hist(scale(prod_rev$udpipe_nrc_l), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - lemmas)")

NRC Lexicon with Words
scores_n <- txt_sentiment(x = data_udpipe, 
                        term = "token", # score raw tokens this time (the previous run used lemmas)
                        polarity_terms = nrc_dict, # NRC recoded to +1/-1; stop-words are NOT dropped
                        polarity_negators = c("not"), # negator flipping the polarity of a nearby term
                        polarity_amplifiers = c("very"), # amplifier strengthening a nearby term
                        n_before = 2, # consider the 2 terms preceding each polarity term
                        n_after = 0, # and none of the following ones
                        constrain = F)
prod_rev$udpipe_nrc_w = scores_n$overall$sentiment_polarity
hist(prod_rev$udpipe_nrc_w, col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - words)")

summary(prod_rev$udpipe_nrc_w)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   1.000   2.000   6.268   6.400  61.000
# Compare distributions between the tidy approach using Afinn, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$nrc), col = "lightblue", main = "Sentiment distribution - nrc")
hist(scale(prod_rev$udpipe_nrc_l), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - lemmas)")
hist(scale(prod_rev$udpipe_nrc_w), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - words)")

Does the Sentiment reflect reviews’ stars?

Using udpipe approach considering lemmas only, for different lexicons.
# BING
prod_rev %>%
  select(doc_id, star_sent, udpipe_bing_l, bing) %>%
  mutate(star_sent = ifelse(star_sent == "positive", 1, -1), 
         udpipe_bing_l = ifelse(udpipe_bing_l > 0, 1, ifelse(udpipe_bing_l < 0, -1, 0)), 
         bing = ifelse(bing > 0, 1, ifelse(bing < 0, -1, 0)), bing = replace_na(bing, 0)
         ) %>%
  pivot_longer(cols = c("star_sent", "udpipe_bing_l", "bing")) %>%
  ggplot(aes(doc_id, value, fill = name)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") + 
  theme_bw() + scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2")) + ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using BING)')

# AFINN
prod_rev %>%
  select(doc_id, star_sent, udpipe_afinn_l, afinn) %>%
  mutate(star_sent = ifelse(star_sent == "positive", 1, -1), 
         udpipe_afinn_l = ifelse(udpipe_afinn_l > 0, 1, ifelse(udpipe_afinn_l < 0, -1, 0)), 
         afinn = ifelse(afinn > 0, 1, ifelse(afinn < 0, -1, 0)), afinn = replace_na(afinn, 0)) %>%
  pivot_longer(cols = c("star_sent", "udpipe_afinn_l", "afinn")) %>%
  ggplot(aes(doc_id, value, fill = name)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") + 
  theme_bw() + scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2")) + ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using AFINN)')

# NRC
# Same comparison with the NRC lexicon: map every score to its sign
# (-1 / 0 / 1; sign() is equivalent to the nested ifelse, NA included)
# and facet by measure.
prod_rev %>%
  select(doc_id, star_sent, udpipe_nrc_l, nrc) %>%
  mutate(
    star_sent = ifelse(star_sent == "positive", 1, -1),
    udpipe_nrc_l = sign(udpipe_nrc_l),
    nrc = replace_na(sign(nrc), 0)
  ) %>%
  pivot_longer(cols = c("star_sent", "udpipe_nrc_l", "nrc")) %>%
  ggplot(aes(doc_id, value, fill = name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") +
  theme_bw() +
  scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2")) +
  ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using NRC)')

In all these cases there are some differences. We can also compare the sentiments with the star score (pretending that it is the true one). Notice how these results strongly depend on the pre-processing phase (for the tidy approach we eliminated stopwords, for the udpipe one we considered lemmas instead of words and we didn’t remove stopwords).

Visualizations

Unigram

We start by looking at the most frequent stems in the whole corpus (all the documents).

# Top 10 most frequent stems across the whole corpus.
# Fixes: theme_bw() is a complete theme and RESETS previously set
# theme elements, so it must come BEFORE theme(...) — the original
# order silently discarded the legend/title settings. Also replaced
# geom_bar(stat = "identity") with geom_col and dropped the fill = word
# aesthetic, which was already overridden by the constant geom fill.
tidy_stem %>%
  count(word) %>%
  slice_max(n, n = 10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "skyblue") +
  xlab(NULL) +
  labs(title = "Most common stems in reviews", y = "Stems count") +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(color = "steelblue", size = 12, face = "bold")
  )

Then, we can compare the stems used by people who wrote positive and negative reviews respectively.

# Top 10 stems within positive and negative reviews, one facet each.
# Fixes: removed a duplicated group_by(star_sent); with_ties = F -> FALSE;
# element_text() had a stray positional "dodgerblue" that was silently
# interpreted as the font family; theme_bw() (a complete theme that resets
# elements) now comes BEFORE theme(...) so the settings take effect; the
# axis labels were attached to the wrong aesthetics (word axis got "N").
tidy_stem %>%
  count(star_sent, word) %>%
  group_by(star_sent) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    star_sent = as.factor(star_sent),
    word = reorder_within(word, n, star_sent)
  ) %>%
  ggplot(aes(word, n, fill = star_sent)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~star_sent, scales = "free_y") +
  coord_flip() +
  labs(title = "Most common stems in positive/negative reviews", x = NULL, y = "N") +
  scale_x_reordered() +
  scale_fill_manual(values = c("orangered", "dodgerblue")) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(color = "orangered")
  )

In order to show which stems are important but specific to each category we can use different visualizations/scores.

We use a geom_jitter to compare the frequency of stems in positive and negative comments. The stems which lie near to the red line are used with about the same frequency in the two categories.

# Relative stem frequency in positive vs negative reviews on log-log
# scales; stems near the red identity line occur with similar relative
# frequency in both groups. Jitter reduces overplotting.
# Fixes: T -> TRUE (T/F are ordinary, reassignable variables);
# added ungroup() so the grouped proportion does not leak further.
tidy_stem %>%
  group_by(star_sent) %>%
  count(word, sort = TRUE) %>%
  mutate(prop = n / sum(n)) %>%      # within-sentiment proportion
  select(star_sent, word, prop) %>%
  ungroup() %>%
  pivot_wider(names_from = star_sent, values_from = prop) %>%
  arrange(positive, negative) %>%
  ggplot(aes(positive, negative)) +
  geom_jitter(alpha = 0.5, size = 2.5, width = 0.25, height = 0.25, colour = "steelblue") +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 0) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(color = "red") +
  theme_bw()
## Warning: Removed 1799 rows containing missing values (geom_point).
## Warning: Removed 1799 rows containing missing values (geom_text).

We can compare the log-odds ratio to understand which words are more or less likely to come from each category of reviews (positive or negative).

# Per-word log-odds ratio between positive and negative reviews.
# Only words appearing at least 10 times overall are kept; counts are
# Laplace-smoothed (+1) so the ratio is defined for zero counts.
# Positive logratio = word is more typical of positive reviews.
# Updated: mutate_if() is superseded in dplyr >= 1.0 — use across(where(...)).
word_ratios <- tidy_stem %>%
  count(word, star_sent) %>%
  group_by(word) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  pivot_wider(names_from = star_sent, values_from = n, values_fill = 0) %>%
  mutate(across(where(is.numeric), ~ (.x + 1) / (sum(.x) + 1))) %>%
  mutate(logratio = log(positive / negative)) %>%
  arrange(desc(logratio))

# Show the 15 most characteristic stems on each side of the log-odds
# ratio (most positive-leaning and most negative-leaning).
word_ratios %>%
  group_by(negative_side = logratio < 0) %>%
  slice_max(abs(logratio), n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (Positive/Negative)") +
  scale_fill_manual(
    name = "",
    labels = c("Positive", "Negative"),
    values = c("dodgerblue", "orangered")
  ) +
  theme_bw()

We can also plot some wordclouds.

# Word cloud of the 100 most frequent stems (minimum frequency 2).
# Fix: spelled out FALSE instead of F (F is a reassignable variable).
tidy_stem %>%
  count(word) %>%
  with(wordcloud(
    word, n,
    scale = c(5, 0.7), max.words = 100, min.freq = 2,
    random.order = FALSE, rot.per = 0.15,
    colors = brewer.pal(8, "Paired")
  ))

# We use the words instead of the stems and the wordcloud2 package,
# which expects a data frame with `word` and `freq` columns.
# Fixes: use <- for assignment and TRUE instead of T; build the frame
# in one pipeline instead of reconstructing it column by column.
frame <- tidy_text %>%
  count(word, sort = TRUE) %>%
  rename(freq = n) %>%
  as.data.frame()
wordcloud2(frame, color = "skyblue")
Bigrams

We can show some of the previous plots also for bigrams. Let’s consider a new type of visualization. More precisely, if you are interested in the relationships between words, it is useful to consider a network (including the “direction” of each link).

library(ggraph)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Build stemmed bigrams from the raw review text: tokenize into word
# pairs, drop pairs where either word is a stopword, then stem both.
tidy_big_stem <- prod_rev %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  mutate(
    word1 = wordStem(word1),
    word2 = wordStem(word2)
  )

# Count stemmed bigrams and keep those seen at least twice as a
# directed graph (word1 -> word2) for the network plot below.
# Fix: use <- for top-level assignment, consistent with the rest of the file.
bigram_counts <- tidy_big_stem %>%
  count(word1, word2, sort = TRUE)
bigram_graph <- bigram_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame()


# Directed network of stemmed bigrams; edge transparency encodes the
# bigram count. The seed fixes the force-directed ("fr") layout.
set.seed(9265)
a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    end_cap = circle(1, "inches")
  ) +
  geom_node_point(color = "skyblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Bigrams Using POS
# Co-occurrences of noun/adjective lemmas with their immediate neighbour
# (skipgram = 1): POS-filtered pairs of lemmas that follow one another.
cooc <- cooccurrence(tidy_lemma$lemma, relevant = tidy_lemma$upos %in% c("NOUN", "ADJ"), skipgram = 1)
head(cooc)
##   term1      term2 cooc
## 1  bull      fight   13
## 2 first      novel   10
## 3  main  character    9
## 4 first       book    9
## 5 other       book    8
## 6  lost generation    7
# Network of the 15 strongest neighbour co-occurrences; both edge width
# and transparency encode the co-occurrence count.
wordnetwork <- graph_from_data_frame(head(cooc, 15))

ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "skyblue") +
  geom_node_text(aes(label = name), col = "darkblue", size = 4) +
  theme_void() +
  labs(title = "Words following one another", subtitle = "Nouns & Adjective")

Co-Occurency Using POS
# Document-level co-occurrences: how often two noun/adjective lemmas
# appear within the same review (grouped by doc_id), regardless of position.
cooc <- cooccurrence(x = subset(tidy_lemma, upos %in% c("NOUN", "ADJ")), term = "lemma", group = c("doc_id"))
head(cooc)
##   term1     term2 cooc
## 1  book      bull  285
## 2  book character  240
## 3  book     novel  232
## 4  book      time  163
## 5  book     other  162
## 6  book       man  155
# Network of the 30 strongest within-document co-occurrences.
# Fix: theme(legend.position = "none") was applied BEFORE theme_void();
# complete themes reset individual theme elements, so the legend was not
# actually suppressed. Apply theme_void() first, then hide the legend.
wordnetwork <- head(cooc, 30)
wordnetwork <- graph_from_data_frame(wordnetwork)

ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "skyblue") +
  geom_node_text(aes(label = name), col = "darkblue", size = 4) +
  theme_void() +
  theme(legend.position = "none") +
  labs(title = "Cooccurrences within documents", subtitle = "Nouns & Adjective")

Dependency Parsing
library(textplot)
# Visualize the dependency parse of the first sentence of the first review.
first_sentence <- tidy_lemma %>%
  filter(doc_id == "1" & sentence_id == "1")
textplot_dependencyparser(first_sentence)